Chargement des packages R

library(ggplot2)
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Analyse en Composantes Principales (ACP)

SeqApiPop

629 échantillons - MAF > 0.01

LD pruning = 0.3 (fenêtre de 1749 SNPS et pas de 175 bp)

setwd("~/Documents/Stage_NB/data/maf001_LD03")

# fichiers pour SeqApiPop 629 échantillons et filtre maf001
#eigenvec_refpop <- read.table("SeqApiPop_629_maf001_acp.eigenvec", header = F)
#eigenval_refpop <- read.table("SeqApiPop_629_maf001_acp.eigenval", header = F)

# fichiers pour SeqApiPop 629 échantillons et filtre maf001 + LD pruning = 0.3 (fenêtre de 1749 SNPs et pas de 175 bp)
eigenvec_refpop <- read.table("SeqApiPop_629_maf001_LD03_acp.eigenvec", header = F)
eigenval_refpop <- read.table("SeqApiPop_629_maf001_LD03_acp.eigenval", header = F)

seq_api_labels <- read.csv("~/Documents/Stage_NB/data/SeqApiPop_labels.csv")

colnames(eigenvec_refpop)[colnames(eigenvec_refpop) == "V2"] <- "name"
eigenvec_refpop_seq_api_labels <- merge(eigenvec_refpop, seq_api_labels, by = "name")

eigen_percent_refpop <- round((eigenval_refpop / (sum(eigenval_refpop) )*100),2)
# Clustering hiérarchique
# Tree
setwd("~/Documents/Stage_NB/data/maf001_LD03")
matrice_app_refpop <- read.table("SeqApiPop_629_maf001_LD03_acp.rel", header = FALSE)

dist_matrice_refpop <- dist(matrice_app_refpop)
hc_refpop <- hclust(dist_matrice_refpop, method = "ward.D2")
plot(hc_refpop)

# heatmap
#heatplot(as.matrix(dist(matrice_app_refpop,diag=T)), cols.default = F, lowcol = 'blue', highcol='yellow', dualScale = F, scale='none', method='ward.D2')
# ACP
# Extraction des 301 individus des populations de référence

eigenvec_refpop_seq_api_labels <- eigenvec_refpop_seq_api_labels[eigenvec_refpop_seq_api_labels$GeneticOrigin != 'Unknown' &
                                                                   eigenvec_refpop_seq_api_labels$Label != 'Ariege Conservatory' &
                                                                   eigenvec_refpop_seq_api_labels$Label != 'Brittany Conservatory' &
                                                                   eigenvec_refpop_seq_api_labels$UniqueInHive != 'Unknown' &
                                                                   eigenvec_refpop_seq_api_labels$UniqueInHive != 'Buckfast' &
                                                                   eigenvec_refpop_seq_api_labels$GeneticOrigin != 'Buckfast', ]

custom_colors_label2 <- c("black", "mediumpurple4", "mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen")

lambda <- eigenval_refpop$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)
PC1/PC2
# ACP avec variance expliquée
ggplot(data = eigenvec_refpop_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.44, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.083, xmax = 0.05, ymin = -0.11, ymax = -0.04)

ggplot(data = eigenvec_refpop_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.44, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

# ellipses de seuil de confiance 0.97 autour des points selon la variable Label
ggplot(data = eigenvec_refpop_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  geom_text(aes(x = 0.045, y = 0.04, label = "C-lineage"), size = 4, color = "black") +
  geom_text(aes(x = -0.02, y = 0.05, label = "M-lineage"), size = 4, color = "black") +
  geom_text(aes(x = -0.02, y = -0.16, label = "O-lineage"), size = 4, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.44, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) 
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# ellipses avec couleur
ggplot(data = eigenvec_refpop_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label, fill = Label), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  geom_text(aes(x = 0.045, y = 0.04, label = "M-lineage"), size = 4, color = "black") +
  geom_text(aes(x = -0.02, y = 0.05, label = "C-lineage"), size = 4, color = "black") +
  geom_text(aes(x = -0.02, y = -0.16, label = "O-lineage"), size = 4, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_fill_manual(values = custom_colors_label2, 
                    breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                               "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                    labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                               "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +  
  theme(legend.position = c(0.44, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

PC3/PC4
# ACP
ggplot(data = eigenvec_refpop_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.56, 0.05), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = -0.04, xmax = -0.02, ymin = -0.16, ymax = -0.06)

LD pruning = 0.2 (fenêtre de 1749 SNPS et pas de 175 bp)

# LD02
setwd("~/Documents/Stage_NB/data/maf001_LD02")

eigenvec_LD02 <- read.table("SeqApiPop_629_maf001_LD02_acp.eigenvec", header = F)
eigenval_LD02 <- read.table("SeqApiPop_629_maf001_LD02_acp.eigenval", header = F)

seq_api_labels <- read.csv("~/Documents/Stage_NB/data/SeqApiPop_labels.csv")

colnames(eigenvec_LD02)[colnames(eigenvec_LD02) == "V2"] <- "name"
eigenvec_LD02_seq_api_labels <- merge(eigenvec_LD02, seq_api_labels, by = "name")

eigen_percent_LD02 <- round((eigenval_LD02 / (sum(eigenval_LD02) )*100),2)

# Clustering hiérarchique
# Tree
matrice_app_refpop <- read.table("SeqApiPop_629_maf001_LD02_acp.rel", header = FALSE)

dist_matrice_refpop <- dist(matrice_app_refpop)
hc_refpop <- hclust(dist_matrice_refpop, method = "ward.D2")
plot(hc_refpop)

# heatmap
#heatplot(as.matrix(dist(matrice_app_refpop,diag=T)), cols.default = F, lowcol = 'blue', highcol='yellow', dualScale = F, scale='none', method='ward.D2')
# Extraction des 301 individus des populations de référence
eigenvec_LD02_seq_api_labels <- eigenvec_LD02_seq_api_labels[eigenvec_LD02_seq_api_labels$GeneticOrigin != 'Unknown' &
                                                                   eigenvec_LD02_seq_api_labels$Label != 'Ariege Conservatory' &
                                                                   eigenvec_LD02_seq_api_labels$Label != 'Brittany Conservatory' &
                                                                   eigenvec_LD02_seq_api_labels$UniqueInHive != 'Unknown' &
                                                                   eigenvec_LD02_seq_api_labels$UniqueInHive != 'Buckfast' &
                                                                   eigenvec_LD02_seq_api_labels$GeneticOrigin != 'Buckfast', ]

custom_colors_label2 <- c("black", "mediumpurple4", "mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen")

lambda <- eigenval_LD02$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)
PC1/PC2
ggplot(data = eigenvec_LD02_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() + 
  theme(legend.position = c(0.05, 0.05), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.105, xmax = 0.06, ymin = -0.01, ymax = 0.07)

ggplot(data = eigenvec_LD02_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() + 
  theme(legend.position = c(0.05, 0.05), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

# ellipses autour des points selon Label
ggplot(data = eigenvec_LD02_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() + 
  theme(legend.position = c(0.05, 0.05), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) 

# ellipses avec couleur
ggplot(data = eigenvec_LD02_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label, fill = Label), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_fill_manual(values = custom_colors_label2, 
                    breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                               "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                    labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                               "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +  
  scale_y_reverse() + 
  theme(legend.position = c(0.05, 0.05), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

PC3/PC4
ggplot(data = eigenvec_LD02_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() +  
  theme(legend.position = c(0.05, 0.05), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = -0.13, xmax = -0.18, ymin = 0.03, ymax = 0.08)

LD pruning = 0.1 (fenêtre de 1749 SNPS et pas de 175 bp)

PC1/PC2
# LD01
setwd("~/Documents/Stage_NB/data/maf001_LD01")

eigenvec_LD01 <- read.table("SeqApiPop_629_maf001_LD01_acp.eigenvec", header = F)
eigenval_LD01 <- read.table("SeqApiPop_629_maf001_LD01_acp.eigenval", header = F)

seq_api_labels <- read.csv("~/Documents/Stage_NB/data/SeqApiPop_labels.csv")

colnames(eigenvec_LD01)[colnames(eigenvec_LD01) == "V2"] <- "name"
eigenvec_LD01_seq_api_labels <- merge(eigenvec_LD01, seq_api_labels, by = "name")

eigen_percent_LD01 <- round((eigenval_LD01 / (sum(eigenval_LD01) )*100),2)

# Clustering hiérarchique
# Tree
matrice_app_refpop <- read.table("SeqApiPop_629_maf001_LD01_acp.rel", header = FALSE)

dist_matrice_refpop <- dist(matrice_app_refpop)
hc_refpop <- hclust(dist_matrice_refpop, method = "ward.D2")
plot(hc_refpop)

#heatmap
#heatplot(as.matrix(dist(matrice_app_refpop,diag=T)), cols.default = F, lowcol = 'blue', highcol='yellow', dualScale = F, scale='none', method='ward.D2')

# Extraction des 301 individus des populations de référence
eigenvec_LD01_seq_api_labels <- eigenvec_LD01_seq_api_labels[eigenvec_LD01_seq_api_labels$GeneticOrigin != 'Unknown' &
                                                               eigenvec_LD01_seq_api_labels$Label != 'Ariege Conservatory' &
                                                               eigenvec_LD01_seq_api_labels$Label != 'Brittany Conservatory' &
                                                               eigenvec_LD01_seq_api_labels$UniqueInHive != 'Unknown' &
                                                               eigenvec_LD01_seq_api_labels$UniqueInHive != 'Buckfast' &
                                                               eigenvec_LD01_seq_api_labels$GeneticOrigin != 'Buckfast', ]

custom_colors_label2 <- c("black", "mediumpurple4", "mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen")

lambda <- eigenval_LD01$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)

# ACP
ggplot(data = eigenvec_LD01_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() + 
  theme(legend.position = c(0.05, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.13, xmax = 0.08, ymin = -0.02, ymax = 0.06)

ggplot(data = eigenvec_LD01_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() +  
  theme(legend.position = c(0.05, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

# ellipses autour des points selon Label
ggplot(data = eigenvec_LD01_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() + 
  theme(legend.position = c(0.05, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) 

# ellipses avec couleur
ggplot(data = eigenvec_LD01_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label, fill = Label), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_fill_manual(values = custom_colors_label2, 
                    breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                               "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                    labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                               "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +  
  scale_y_reverse() +  
  theme(legend.position = c(0.05, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

PC3/PC4
# ACP
ggplot(data = eigenvec_LD01_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() +  
  theme(legend.position = c(0.5, 0.05), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.09, xmax = 0.135, ymin = -0.05, ymax = 0.01)

LD pruning = 0.05 (fenêtre de 1749 SNPS et pas de 175 bp)

PC1/PC2
# LD005
setwd("~/Documents/Stage_NB/data/maf001_LD005")

eigenvec_LD005 <- read.table("SeqApiPop_629_maf001_LD005_acp.eigenvec", header = F)
eigenval_LD005 <- read.table("SeqApiPop_629_maf001_LD005_acp.eigenval", header = F)

seq_api_labels <- read.csv("~/Documents/Stage_NB/data/SeqApiPop_labels.csv")

colnames(eigenvec_LD005)[colnames(eigenvec_LD005) == "V2"] <- "name"
eigenvec_LD005_seq_api_labels <- merge(eigenvec_LD005, seq_api_labels, by = "name")

eigen_percent_LD005 <- round((eigenval_LD005 / (sum(eigenval_LD005) )*100),2)

# Clustering hiérarchique
# Tree
matrice_app_refpop <- read.table("SeqApiPop_629_maf001_LD005_acp.rel", header = FALSE)

dist_matrice_refpop <- dist(matrice_app_refpop)
hc_refpop <- hclust(dist_matrice_refpop, method = "ward.D2")
plot(hc_refpop)

# heatmap
#heatplot(as.matrix(dist(matrice_app_refpop,diag=T)), cols.default = F, lowcol = 'blue', highcol='yellow', dualScale = F, scale='none', method='ward.D2')

# Extraction des 301 individus des populations de référence
eigenvec_LD005_seq_api_labels <- eigenvec_LD005_seq_api_labels[eigenvec_LD005_seq_api_labels$GeneticOrigin != 'Unknown' &
                                                               eigenvec_LD005_seq_api_labels$Label != 'Ariege Conservatory' &
                                                               eigenvec_LD005_seq_api_labels$Label != 'Brittany Conservatory' &
                                                               eigenvec_LD005_seq_api_labels$UniqueInHive != 'Unknown' &
                                                               eigenvec_LD005_seq_api_labels$UniqueInHive != 'Buckfast' &
                                                               eigenvec_LD005_seq_api_labels$GeneticOrigin != 'Buckfast', ]

custom_colors_label2 <- c("black", "mediumpurple4", "mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen")

lambda <- eigenval_LD005$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)

ggplot(data = eigenvec_LD005_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() +  
  theme(legend.position = c(0.26, 0.64), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.077, xmax = 0.042, ymin = 0.015, ymax = 0.08)

ggplot(data = eigenvec_LD005_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() +  
  theme(legend.position = c(0.26, 0.64), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

# ellipses autour des points selon Label
ggplot(data = eigenvec_LD005_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() +  
  theme(legend.position = c(0.25, 0.64), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) 

# ellipses avec couleur
ggplot(data = eigenvec_LD005_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label, fill = Label), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_fill_manual(values = custom_colors_label2, 
                    breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                               "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                    labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                               "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +  
  scale_y_reverse() +  
  theme(legend.position = c(0.25, 0.64), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

PC3/PC4
# ACP
ggplot(data = eigenvec_LD005_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.06, xmax = 0.11, ymin = -0.06, ymax = 0)

LD pruning = 0.04 (fenêtre de 1749 SNPS et pas de 175 bp)

PC1/PC2
# LD004
setwd("~/Documents/Stage_NB/data/maf001_LD004")

eigenvec_LD004 <- read.table("SeqApiPop_629_maf001_LD004_acp.eigenvec", header = F)
eigenval_LD004 <- read.table("SeqApiPop_629_maf001_LD004_acp.eigenval", header = F)

seq_api_labels <- read.csv("~/Documents/Stage_NB/data/SeqApiPop_labels.csv")

colnames(eigenvec_LD004)[colnames(eigenvec_LD004) == "V2"] <- "name"
eigenvec_LD004_seq_api_labels <- merge(eigenvec_LD004, seq_api_labels, by = "name")

eigen_percent_LD004 <- round((eigenval_LD004 / (sum(eigenval_LD004) )*100),2)

# Clustering hiérarchique
# Tree
matrice_app_refpop <- read.table("SeqApiPop_629_maf001_LD004_acp.rel", header = FALSE)

dist_matrice_refpop <- dist(matrice_app_refpop)
hc_refpop <- hclust(dist_matrice_refpop, method = "ward.D2")
plot(hc_refpop)

# heatmap
#heatplot(as.matrix(dist(matrice_app_refpop,diag=T)), cols.default = F, lowcol = 'blue', highcol='yellow', dualScale = F, scale='none', method='ward.D2')

# ACP
#filter 629 -> 301 RefPop
eigenvec_LD004_seq_api_labels <- eigenvec_LD004_seq_api_labels[eigenvec_LD004_seq_api_labels$GeneticOrigin != 'Unknown' &
                                                                 eigenvec_LD004_seq_api_labels$Label != 'Ariege Conservatory' &
                                                                 eigenvec_LD004_seq_api_labels$Label != 'Brittany Conservatory' &
                                                                 eigenvec_LD004_seq_api_labels$UniqueInHive != 'Unknown' &
                                                                 eigenvec_LD004_seq_api_labels$UniqueInHive != 'Buckfast' &
                                                                 eigenvec_LD004_seq_api_labels$GeneticOrigin != 'Buckfast', ]

custom_colors_label2 <- c("black", "mediumpurple4", "mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen")

lambda <- eigenval_LD004$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)

ggplot(data = eigenvec_LD004_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.32, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.068, xmax = 0.042, ymin = -0.03, ymax = -0.11)

ggplot(data = eigenvec_LD004_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.32, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

# ellipses autour des points selon Label
ggplot(data = eigenvec_LD004_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.42, 0.06), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) 

# ellipses avec couleur
ggplot(data = eigenvec_LD004_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label, fill = Label), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_fill_manual(values = custom_colors_label2, 
                    breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                               "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                    labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                               "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +  
  theme(legend.position = c(0.42, 0.06), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

PC3/PC4
# ACP
ggplot(data = eigenvec_LD004_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.03, xmax = 0.09, ymin = 0.06, ymax = 0.12)

LD pruning = 0.03 (fenêtre de 1749 SNPS et pas de 175 bp)

PC1/PC2
# LD003
setwd("~/Documents/Stage_NB/data/maf001_LD003")

eigenvec_LD003 <- read.table("SeqApiPop_629_maf001_LD003_acp.eigenvec", header = F)
eigenval_LD003 <- read.table("SeqApiPop_629_maf001_LD003_acp.eigenval", header = F)

seq_api_labels <- read.csv("~/Documents/Stage_NB/data/SeqApiPop_labels.csv")

colnames(eigenvec_LD003)[colnames(eigenvec_LD003) == "V2"] <- "name"
eigenvec_LD003_seq_api_labels <- merge(eigenvec_LD003, seq_api_labels, by = "name")

eigen_percent_LD003 <- round((eigenval_LD003 / (sum(eigenval_LD003) )*100),2)

# Clustering hiérarchique
# Tree
matrice_app_refpop <- read.table("SeqApiPop_629_maf001_LD003_acp.rel", header = FALSE)

dist_matrice_refpop <- dist(matrice_app_refpop)
hc_refpop <- hclust(dist_matrice_refpop, method = "ward.D2")
plot(hc_refpop)

# heatmap
#heatplot(as.matrix(dist(matrice_app_refpop,diag=T)), cols.default = F, lowcol = 'blue', highcol='yellow', dualScale = F, scale='none', method='ward.D2')

# Extraction des 301 individus des populations de référence
eigenvec_LD003_seq_api_labels <- eigenvec_LD003_seq_api_labels[eigenvec_LD003_seq_api_labels$GeneticOrigin != 'Unknown' &
                                                                 eigenvec_LD003_seq_api_labels$Label != 'Ariege Conservatory' &
                                                                 eigenvec_LD003_seq_api_labels$Label != 'Brittany Conservatory' &
                                                                 eigenvec_LD003_seq_api_labels$UniqueInHive != 'Unknown' &
                                                                 eigenvec_LD003_seq_api_labels$UniqueInHive != 'Buckfast' &
                                                                 eigenvec_LD003_seq_api_labels$GeneticOrigin != 'Buckfast', ]

custom_colors_label2 <- c("black", "mediumpurple4", "mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen")

lambda <- eigenval_LD003$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)

ggplot(data = eigenvec_LD003_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() +
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.035, xmax = 0.065, ymin = -0.04, ymax = -0.11)

ggplot(data = eigenvec_LD003_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() +  
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

# ellipses autour des points selon Label
ggplot(data = eigenvec_LD003_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() + 
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) 

# ellipses avec couleur
ggplot(data = eigenvec_LD003_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label, fill = Label), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_fill_manual(values = custom_colors_label2, 
                    breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                               "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                    labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                               "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +  
  scale_y_reverse() + 
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

PC3/PC4
# ACP
ggplot(data = eigenvec_LD003_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.05, xmax = 0.1, ymin = 0.08, ymax = 0.16)

LD pruning = 0.01 (fenêtre de 1749 SNPS et pas de 175 bp)

PC1/PC2
# LD001
setwd("~/Documents/Stage_NB/data/maf001_LD001")

eigenvec_LD001 <- read.table("SeqApiPop_629_maf001_LD001_acp.eigenvec", header = F)
eigenval_LD001 <- read.table("SeqApiPop_629_maf001_LD001_acp.eigenval", header = F)

seq_api_labels <- read.csv("~/Documents/Stage_NB/data/SeqApiPop_labels.csv")

colnames(eigenvec_LD001)[colnames(eigenvec_LD001) == "V2"] <- "name"
eigenvec_LD001_seq_api_labels <- merge(eigenvec_LD001, seq_api_labels, by = "name")

eigen_percent_LD001 <- round((eigenval_LD001 / (sum(eigenval_LD001) )*100),2)

# Clustering hiérarchique
# Tree
matrice_app_refpop <- read.table("SeqApiPop_629_maf001_LD001_acp.rel", header = FALSE)

dist_matrice_refpop <- dist(matrice_app_refpop)
hc_refpop <- hclust(dist_matrice_refpop, method = "ward.D2")
plot(hc_refpop)

# heatmap
#heatplot(as.matrix(dist(matrice_app_refpop,diag=T)), cols.default = F, lowcol = 'blue', highcol='yellow', dualScale = F, scale='none', method='ward.D2')

# Extraction des 301 individus des populations de référence
eigenvec_LD001_seq_api_labels <- eigenvec_LD001_seq_api_labels[eigenvec_LD001_seq_api_labels$GeneticOrigin != 'Unknown' &
                                                                 eigenvec_LD001_seq_api_labels$Label != 'Ariege Conservatory' &
                                                                 eigenvec_LD001_seq_api_labels$Label != 'Brittany Conservatory' &
                                                                 eigenvec_LD001_seq_api_labels$UniqueInHive != 'Unknown' &
                                                                 eigenvec_LD001_seq_api_labels$UniqueInHive != 'Buckfast' &
                                                                 eigenvec_LD001_seq_api_labels$GeneticOrigin != 'Buckfast', ]

custom_colors_label2 <- c("black", "mediumpurple4", "mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen")

lambda <- eigenval_LD001$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)

ggplot(data = eigenvec_LD001_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.035, xmax = 0.065, ymin = -0.04, ymax = -0.11)

ggplot(data = eigenvec_LD001_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

# ellipses autour des points selon Label
ggplot(data = eigenvec_LD001_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) 

# ellipses avec couleur
ggplot(data = eigenvec_LD001_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label, fill = Label), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_fill_manual(values = custom_colors_label2, 
                    breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                               "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                    labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                               "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +  
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

PC3/PC4
# ACP
ggplot(data = eigenvec_LD001_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = -0.04, xmax = 0, ymin = 0.08, ymax = 0.15)

561 échantillons - MAF > 0.01

LD pruning = 0.3 (fenêtre de 1749 SNPS et pas de 175 bp)

setwd("~/Documents/Stage_NB/data/SeqApiPop_561_maf001_LD03")

eigenvec_refpop <- read.table("SeqApiPop_561_maf001_LD03_acp.eigenvec", header = F)
eigenval_refpop <- read.table("SeqApiPop_561_maf001_LD03_acp.eigenval", header = F)

seq_api_labels <- read.csv("~/Documents/Stage_NB/data/SeqApiPop_labels.csv")

colnames(eigenvec_refpop)[colnames(eigenvec_refpop) == "V2"] <- "name"
eigenvec_refpop_seq_api_labels <- merge(eigenvec_refpop, seq_api_labels, by = "name")

eigen_percent_refpop <- round((eigenval_refpop / (sum(eigenval_refpop) )*100),2)
# Clustering hiérarchique
# Tree
setwd("~/Documents/Stage_NB/data/SeqApiPop_561_maf001_LD03")
matrice_app_refpop <- read.table("SeqApiPop_561_maf001_LD03_acp.rel", header = FALSE)

dist_matrice_refpop <- dist(matrice_app_refpop)
hc_refpop <- hclust(dist_matrice_refpop, method = "ward.D2")
plot(hc_refpop)

# heatmap
#heatplot(as.matrix(dist(matrice_app_refpop,diag=T)), cols.default = F, lowcol = 'blue', highcol='yellow', dualScale = F, scale='none', method='ward.D2')
# Extraction des 301 individus des populations de référence
eigenvec_refpop_seq_api_labels <- eigenvec_refpop_seq_api_labels[eigenvec_refpop_seq_api_labels$GeneticOrigin != 'Unknown' &
                                                                   eigenvec_refpop_seq_api_labels$Label != 'Ariege Conservatory' &
                                                                   eigenvec_refpop_seq_api_labels$Label != 'Brittany Conservatory' &
                                                                   eigenvec_refpop_seq_api_labels$UniqueInHive != 'Unknown' &
                                                                   eigenvec_refpop_seq_api_labels$UniqueInHive != 'Buckfast' &
                                                                   eigenvec_refpop_seq_api_labels$GeneticOrigin != 'Buckfast', ]

custom_colors_label2 <- c("mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen")

lambda <- eigenval_refpop$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)
PC1/PC2
ggplot(data = eigenvec_refpop_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.43, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.106, xmax = 0.065, ymin = -0.11, ymax = -0.03)

# ellipses autour des points selon Label
ggplot(data = eigenvec_refpop_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  geom_text(aes(x = 0.05, y = 0.04, label = "M-lineage"), size = 4, color = "black") +
  geom_text(aes(x = -0.01, y = 0.05, label = "C-lineage"), size = 4, color = "black") +
  geom_text(aes(x = -0.02, y = -0.16, label = "O-lineage"), size = 4, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c( "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.45, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) 

# ellipses avec couleur
ggplot(data = eigenvec_refpop_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label, fill = Label), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  geom_text(aes(x = 0.045, y = 0.04, label = "M-lineage"), size = 4, color = "black") +
  geom_text(aes(x = -0.01, y = 0.05, label = "C-lineage"), size = 4, color = "black") +
  geom_text(aes(x = -0.02, y = -0.16, label = "O-lineage"), size = 4, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_fill_manual(values = custom_colors_label2, 
                    breaks = c( "Iberiensis Spain", 
                               "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                    labels = c( "Iberiensis Spain", 
                               "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +  
  theme(legend.position = c(0.45, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

PC3/PC4
# ACP 
ggplot(data = eigenvec_refpop_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.05, 0.05), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = -0.11, xmax = -0.2, ymin = 0.07, ymax = 0.02)

# ellipses autour des points selon Label
ggplot(data = eigenvec_refpop_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c( "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.05, 0.05), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) 

# ellipses avec couleur
ggplot(data = eigenvec_refpop_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label, fill = Label), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_fill_manual(values = custom_colors_label2, 
                    breaks = c( "Iberiensis Spain", 
                               "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                    labels = c( "Iberiensis Spain", 
                               "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +  
  theme(legend.position = c(0.05, 0.05), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

629 échantillons - SNPsBeeMuSe filtered

No LD pruning - 10030 SNPS

PC1/PC2
setwd("~/Documents/Stage_NB/data/SeqApiPop_629_SNPsBeeMuSe")

# SNPsBeeMuSe filtrés
eigenvec_SNPsBeeMuSe <- read.table("SeqApiPop_629_SNPsBeeMuSe_filtered_acp.eigenvec", header = F)
eigenval_SNPsBeeMuSe <- read.table("SeqApiPop_629_SNPsBeeMuSe_filtered_acp.eigenval", header = F)

seq_api_labels <- read.csv("~/Documents/Stage_NB/data/SeqApiPop_labels.csv")

colnames(eigenvec_SNPsBeeMuSe)[colnames(eigenvec_SNPsBeeMuSe) == "V2"] <- "name"
eigenvec_SNPsBeeMuSe_seq_api_labels <- merge(eigenvec_SNPsBeeMuSe, seq_api_labels, by = "name")

eigen_percent_SNPsBeeMuSe <- round((eigenval_SNPsBeeMuSe / (sum(eigenval_SNPsBeeMuSe) )*100),2)

# Clustering hiérarchique
# Tree
matrice_app_refpop <- read.table("SeqApiPop_629_SNPsBeeMuSe_filtered_acp.rel", header = FALSE)

dist_matrice_refpop <- dist(matrice_app_refpop)
hc_refpop <- hclust(dist_matrice_refpop, method = "ward.D2")
plot(hc_refpop)

# heatmap
#heatplot(as.matrix(dist(matrice_app_refpop,diag=T)), cols.default = F, lowcol = 'blue', highcol='yellow', dualScale = F, scale='none', method='ward.D2')

# Extraction des 301 individus des populations de référence
eigenvec_SNPsBeeMuSe_seq_api_labels <- eigenvec_SNPsBeeMuSe_seq_api_labels[eigenvec_SNPsBeeMuSe_seq_api_labels$GeneticOrigin != 'Unknown' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$Label != 'Ariege Conservatory' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$Label != 'Brittany Conservatory' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$UniqueInHive != 'Unknown' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$UniqueInHive != 'Buckfast' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$GeneticOrigin != 'Buckfast', ]

custom_colors_label2 <- c("black", "mediumpurple4", "mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen")

lambda <- eigenval_SNPsBeeMuSe$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)

ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() +  
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.035, xmax = 0.065, ymin = -0.04, ymax = -0.11)

#ellipses autour des points selon Label
ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() +  
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) 

# ellipses avec couleur
ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label, fill = Label), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_fill_manual(values = custom_colors_label2, 
                    breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                               "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                    labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                               "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +  
  scale_y_reverse() +  
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

PC3/PC4
ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() +  
  theme(legend.position = c(0.53, 0.7), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = -0.046, xmax = -0.024, ymin = 0.08, ymax = 0.02)

MAF > 0.01 - LD pruning = 0.3 (fenêtre de 1749 SNPs et pas de 175 bp) - 3848 SNPs

setwd("~/Documents/Stage_NB/data/SeqApiPop_629_SNPsBeeMuSe")

eigenvec_SNPsBeeMuSe <- read.table("SeqApiPop_629_SNPsBeeMuSe_filtered_maf001_LD03_acp.eigenvec", header = F)
eigenval_SNPsBeeMuSe <- read.table("SeqApiPop_629_SNPsBeeMuSe_filtered_maf001_LD03_acp.eigenval", header = F)

seq_api_labels <- read.csv("~/Documents/Stage_NB/data/SeqApiPop_labels.csv")

colnames(eigenvec_SNPsBeeMuSe)[colnames(eigenvec_SNPsBeeMuSe) == "V2"] <- "name"
eigenvec_SNPsBeeMuSe_seq_api_labels <- merge(eigenvec_SNPsBeeMuSe, seq_api_labels, by = "name")

eigen_percent_SNPsBeeMuSe <- round((eigenval_SNPsBeeMuSe / (sum(eigenval_SNPsBeeMuSe) )*100),2)

# Clustering hiérarchique
# Tree
matrice_app_refpop <- read.table("SeqApiPop_629_SNPsBeeMuSe_filtered_maf001_LD03_acp.rel", header = FALSE)

dist_matrice_refpop <- dist(matrice_app_refpop)
hc_refpop <- hclust(dist_matrice_refpop, method = "ward.D2")
plot(hc_refpop)

# heatmap
#heatplot(as.matrix(dist(matrice_app_refpop,diag=T)), cols.default = F, lowcol = 'blue', highcol='yellow', dualScale = F, scale='none', method='ward.D2')

# Extraction des 301 individus des populations de référence
eigenvec_SNPsBeeMuSe_seq_api_labels <- eigenvec_SNPsBeeMuSe_seq_api_labels[eigenvec_SNPsBeeMuSe_seq_api_labels$GeneticOrigin != 'Unknown' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$Label != 'Ariege Conservatory' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$Label != 'Brittany Conservatory' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$UniqueInHive != 'Unknown' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$UniqueInHive != 'Buckfast' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$GeneticOrigin != 'Buckfast', ]

custom_colors_label2 <- c("black", "mediumpurple4", "mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen")

lambda <- eigenval_SNPsBeeMuSe$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)
PC1/PC2
ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.035, xmax = 0.065, ymin = -0.04, ymax = -0.11)

ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

# ellipses autour des points selon Label
ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) 

# ellipses avec couleur
ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label, fill = Label), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_fill_manual(values = custom_colors_label2, 
                    breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                               "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                    labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                               "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +  
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

PC3/PC4
ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.5, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.025, xmax = 0.05, ymin = 0.02, ymax = 0.11)

MAF > 0.01 - LD pruning = 0.1 (fenêtre de 50 SNPs et pas de 10 bp) - 1055 SNPs

setwd("~/Documents/Stage_NB/data/SeqApiPop_629_SNPsBeeMuSe")

eigenvec_SNPsBeeMuSe <- read.table("SeqApiPop_629_SNPsBeeMuSe_filtered_maf001_LD03_default_acp.eigenvec", header = F)
eigenval_SNPsBeeMuSe <- read.table("SeqApiPop_629_SNPsBeeMuSe_filtered_maf001_LD03_default_acp.eigenval", header = F)

seq_api_labels <- read.csv("~/Documents/Stage_NB/data/SeqApiPop_labels.csv")

colnames(eigenvec_SNPsBeeMuSe)[colnames(eigenvec_SNPsBeeMuSe) == "V2"] <- "name"
eigenvec_SNPsBeeMuSe_seq_api_labels <- merge(eigenvec_SNPsBeeMuSe, seq_api_labels, by = "name")

eigen_percent_SNPsBeeMuSe <- round((eigenval_SNPsBeeMuSe / (sum(eigenval_SNPsBeeMuSe) )*100),2)

# Clustering hiérarchique
# Tree
matrice_app_refpop <- read.table("SeqApiPop_629_SNPsBeeMuSe_filtered_maf001_LD03_default_acp.rel", header = FALSE)

dist_matrice_refpop <- dist(matrice_app_refpop)
hc_refpop <- hclust(dist_matrice_refpop, method = "ward.D2")
plot(hc_refpop)

# Extraction des 301 individus des populations de référence
eigenvec_SNPsBeeMuSe_seq_api_labels <- eigenvec_SNPsBeeMuSe_seq_api_labels[eigenvec_SNPsBeeMuSe_seq_api_labels$GeneticOrigin != 'Unknown' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$Label != 'Ariege Conservatory' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$Label != 'Brittany Conservatory' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$UniqueInHive != 'Unknown' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$UniqueInHive != 'Buckfast' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$GeneticOrigin != 'Buckfast', ]

custom_colors_label2 <- c("black", "mediumpurple4", "mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen")

lambda <- eigenval_SNPsBeeMuSe$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)
PC1/PC2
ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.035, xmax = 0.065, ymin = -0.04, ymax = -0.11)

ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

# ellipses autour des points selon Label
ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.5, 0.04), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) 

# ellipses avec couleur
ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label, fill = Label), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_fill_manual(values = custom_colors_label2, 
                    breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                               "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                    labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                               "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +  
  theme(legend.position = c(0.5, 0.04), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

PC3/PC4
ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.5, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.025, xmax = 0.05, ymin = 0.02, ymax = 0.11)

561 échantillons - SNPsBeeMuSe filtered

No LD pruning - 10030 SNPS

PC1/PC2
setwd("~/Documents/Stage_NB/data/SeqApiPop_561_SNPsBeeMuSe")

# SNPsBeeMuSe filtrés
eigenvec_SNPsBeeMuSe <- read.table("SeqApiPop_561_SNPsBeeMuSe_filtered_acp.eigenvec", header = F)
eigenval_SNPsBeeMuSe <- read.table("SeqApiPop_561_SNPsBeeMuSe_filtered_acp.eigenval", header = F)

seq_api_labels <- read.csv("~/Documents/Stage_NB/data/SeqApiPop_labels.csv")

colnames(eigenvec_SNPsBeeMuSe)[colnames(eigenvec_SNPsBeeMuSe) == "V2"] <- "name"
eigenvec_SNPsBeeMuSe_seq_api_labels <- merge(eigenvec_SNPsBeeMuSe, seq_api_labels, by = "name")

eigen_percent_SNPsBeeMuSe <- round((eigenval_SNPsBeeMuSe / (sum(eigenval_SNPsBeeMuSe) )*100),2)

# Clustering hiérarchique
# Tree
matrice_app_refpop <- read.table("SeqApiPop_561_SNPsBeeMuSe_filtered_acp.rel", header = FALSE)

dist_matrice_refpop <- dist(matrice_app_refpop)
hc_refpop <- hclust(dist_matrice_refpop, method = "ward.D2")
plot(hc_refpop)

# heatmap
#heatplot(as.matrix(dist(matrice_app_refpop,diag=T)), cols.default = F, lowcol = 'blue', highcol='yellow', dualScale = F, scale='none', method='ward.D2')

# Extraction des 301 individus des populations de référence
eigenvec_SNPsBeeMuSe_seq_api_labels <- eigenvec_SNPsBeeMuSe_seq_api_labels[eigenvec_SNPsBeeMuSe_seq_api_labels$GeneticOrigin != 'Unknown' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$Label != 'Ariege Conservatory' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$Label != 'Brittany Conservatory' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$UniqueInHive != 'Unknown' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$UniqueInHive != 'Buckfast' &
                                                                 eigenvec_SNPsBeeMuSe_seq_api_labels$GeneticOrigin != 'Buckfast', ]

custom_colors_label2 <- c("black", "mediumpurple4", "mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen")

lambda <- eigenval_SNPsBeeMuSe$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)

ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() +  
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.039, xmax = 0.077, ymin = -0.04, ymax = -0.12)

#ellipses autour des points selon Label
ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() +  
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) 

# ellipses avec couleur
ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label, fill = Label), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_fill_manual(values = custom_colors_label2, 
                    breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                               "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                    labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                               "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +  
  scale_y_reverse() +  
  theme(legend.position = c(0.34, 0.02), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1, "lines"),  
        legend.text = element_text(size = 10))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

PC3/PC4
ggplot(data = eigenvec_SNPsBeeMuSe_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label2, 
                     breaks = c("Ouessant Conservatory", "Colonsay Conservatory", "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Mellifera Ouessant", "Mellifera Colonsay", "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_y_reverse() +  
  theme(legend.position = c(0.53, 0.7), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = -0.046, xmax = -0.024, ymin = 0.08, ymax = 0.02)

MAF > 0.01 - LD pruning = 0.3 (fenêtre de 1749 SNPs et pas de 175 bp) - 3848 SNPs

setwd("~/Documents/Stage_NB/data/SeqApiPop_561_SNPsBeeMuse_LD03")

eigenvec_561_SNPsBeeMuSe <- read.table("SeqApiPop_561_SNPsBeeMuSe_filtered_maf001_LD03_acp.eigenvec", header = F)
eigenval_561_SNPsBeeMuSe <- read.table("SeqApiPop_561_SNPsBeeMuSe_filtered_maf001_LD03_acp.eigenval", header = F)

seq_api_labels <- read.csv("~/Documents/Stage_NB/data/SeqApiPop_labels.csv")

colnames(eigenvec_561_SNPsBeeMuSe)[colnames(eigenvec_561_SNPsBeeMuSe) == "V2"] <- "name"
eigenvec_561_SNPsBeeMuSe_seq_api_labels <- merge(eigenvec_561_SNPsBeeMuSe, seq_api_labels, by = "name")

eigen_percent_561_SNPsBeeMuSe <- round((eigenval_561_SNPsBeeMuSe / (sum(eigenval_561_SNPsBeeMuSe) )*100),2)
# Clustering hiérarchique
# Tree
  setwd("~/Documents/Stage_NB/data/SeqApiPop_561_SNPsBeeMuse_LD03")
matrice_app_561_default <- read.table("SeqApiPop_561_SNPsBeeMuSe_filtered_maf001_LD03_acp.rel", header = FALSE)

dist_matrice_561_default <- dist(matrice_app_561_default)
hc_561_d <- hclust(dist_matrice_561_default, method = "ward.D2")
plot(hc_561_d)

#h eatmap
#heatplot(as.matrix(dist(matrice_app_refpop,diag=T)), cols.default = F, lowcol = 'blue', highcol='yellow', dualScale = F, scale='none', method='ward.D2')
# Extraction des 301 individus des populations de référence
eigenvec_561_SNPsBeeMuSe_seq_api_labels <- eigenvec_561_SNPsBeeMuSe_seq_api_labels[eigenvec_561_SNPsBeeMuSe_seq_api_labels$GeneticOrigin != 'Unknown' &
                                                                   eigenvec_561_SNPsBeeMuSe_seq_api_labels$Label != 'Ariege Conservatory' &
                                                                   eigenvec_561_SNPsBeeMuSe_seq_api_labels$Label != 'Brittany Conservatory' &
                                                                   eigenvec_561_SNPsBeeMuSe_seq_api_labels$UniqueInHive != 'Unknown' &
                                                                   eigenvec_561_SNPsBeeMuSe_seq_api_labels$UniqueInHive != 'Buckfast' &
                                                                   eigenvec_561_SNPsBeeMuSe_seq_api_labels$GeneticOrigin != 'Buckfast', ]

custom_colors_label <- c("mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen")

lambda <- eigenval_561_SNPsBeeMuSe$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)
PC1/PC2
ggplot(data = eigenvec_561_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.45, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.085, xmax = 0.045, ymin = -0.12, ymax = -0.05)

PC3/PC4
ggplot(data = eigenvec_561_SNPsBeeMuSe_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.48, 0.05), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.025, xmax = 0.05, ymin = -0.04, ymax = -0.12)

MAF > 0.01 - LD pruning = 0.1 (fenêtre de 50 SNPs et pas de 10 bp) - 1055 SNPs

setwd("~/Documents/Stage_NB/data/SeqApiPop_561_SNPsBeeMuSe_LD_default")

eigenvec_561_SNPsBeeMuSe <- read.table("SeqApiPop_561_SNPsBeeMuSe_filtered_maf001_LD03_default_pruned_acp.eigenvec", header = F)
eigenval_561_SNPsBeeMuSe <- read.table("SeqApiPop_561_SNPsBeeMuSe_filtered_maf001_LD03_default_pruned_acp.eigenval", header = F)

seq_api_labels <- read.csv("~/Documents/Stage_NB/data/SeqApiPop_labels.csv")

colnames(eigenvec_561_SNPsBeeMuSe)[colnames(eigenvec_561_SNPsBeeMuSe) == "V2"] <- "name"
eigenvec_561_SNPsBeeMuSe_seq_api_labels <- merge(eigenvec_561_SNPsBeeMuSe, seq_api_labels, by = "name")

eigen_percent_561_SNPsBeeMuSe <- round((eigenval_561_SNPsBeeMuSe / (sum(eigenval_561_SNPsBeeMuSe) )*100),2)
# Clustering hiérarchique
# Tree
  setwd("~/Documents/Stage_NB/data/SeqApiPop_561_SNPsBeeMuSe_LD_default")
matrice_app_561_default <- read.table("SeqApiPop_561_SNPsBeeMuSe_filtered_maf001_LD03_default_pruned_acp.rel", header = FALSE)

dist_matrice_561_default <- dist(matrice_app_561_default)
hc_561_d <- hclust(dist_matrice_561_default, method = "ward.D2")
plot(hc_561_d)

# heatmap
#heatplot(as.matrix(dist(matrice_app_refpop,diag=T)), cols.default = F, lowcol = 'blue', highcol='yellow', dualScale = F, scale='none', method='ward.D2')
# Extraction des 301 individus des populations de référence
eigenvec_561_SNPsBeeMuSe_seq_api_labels <- eigenvec_561_SNPsBeeMuSe_seq_api_labels[eigenvec_561_SNPsBeeMuSe_seq_api_labels$GeneticOrigin != 'Unknown' &
                                                                   eigenvec_561_SNPsBeeMuSe_seq_api_labels$Label != 'Ariege Conservatory' &
                                                                   eigenvec_561_SNPsBeeMuSe_seq_api_labels$Label != 'Brittany Conservatory' &
                                                                   eigenvec_561_SNPsBeeMuSe_seq_api_labels$UniqueInHive != 'Unknown' &
                                                                   eigenvec_561_SNPsBeeMuSe_seq_api_labels$UniqueInHive != 'Buckfast' &
                                                                   eigenvec_561_SNPsBeeMuSe_seq_api_labels$GeneticOrigin != 'Buckfast', ]

custom_colors_label <- c("mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen")

lambda <- eigenval_561_SNPsBeeMuSe$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)
PC1/PC2
# ACP avec variance expliquée
ggplot(data = eigenvec_561_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.45, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.085, xmax = 0.045, ymin = -0.12, ymax = -0.05)

# ellipses autour des points selon Label
ggplot(data = eigenvec_561_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  geom_text(aes(x = 0.05, y = 0.04, label = "M-lineage"), size = 4, color = "black") +
  geom_text(aes(x = -0.01, y = 0.05, label = "C-lineage"), size = 4, color = "black") +
  geom_text(aes(x = -0.01, y = -0.15, label = "O-lineage"), size = 4, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c( "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.45, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) 

# ellipses avec couleur
ggplot(data = eigenvec_561_SNPsBeeMuSe_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label, fill = Label), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  geom_text(aes(x = 0.045, y = 0.04, label = "M-lineage"), size = 4, color = "black") +
  geom_text(aes(x = -0.01, y = 0.05, label = "C-lineage"), size = 4, color = "black") +
  geom_text(aes(x = -0.01, y = -0.15, label = "O-lineage"), size = 4, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC1", y = "PC2") +
  scale_color_manual(values = custom_colors_label, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_fill_manual(values = custom_colors_label, 
                    breaks = c( "Iberiensis Spain", 
                               "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                    labels = c( "Iberiensis Spain", 
                               "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +  
  theme(legend.position = c(0.45, 0.03), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

PC3/PC4
ggplot(data = eigenvec_561_SNPsBeeMuSe_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.45, 0.05), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.03, xmax = 0.055, ymin = -0.05, ymax = -0.15)

# ellipses autour des points selon Label
ggplot(data = eigenvec_561_SNPsBeeMuSe_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c( "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  theme(legend.position = c(0.45, 0.05), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2)) 

# ellipses avec couleur
ggplot(data = eigenvec_561_SNPsBeeMuSe_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point() +
  stat_ellipse(aes(group = Label, fill = Label), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  labs(title = "PCA Plot - reference populations", x = "PC3", y = "PC4") +
  scale_color_manual(values = custom_colors_label, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +
  scale_fill_manual(values = custom_colors_label, 
                    breaks = c( "Iberiensis Spain", 
                               "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                    labels = c( "Iberiensis Spain", 
                               "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                               "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                               "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) +  
  theme(legend.position = c(0.45, 0.05), legend.justification = c(0, 0),
        legend.background = element_rect(fill = "transparent"),
        legend.key.size = unit(1.2, "lines"),  
        legend.text = element_text(size = 11))  + 
  guides(color = guide_legend(override.aes = list(size = 3.5), ncol = 2))

BeeMuSe

748 échantillons - SNPsBeeMuSe filtered - 10256 SNPs

# Chargement des données
eigenvec <- read.table("BeeMuse_filtered.eigenvec", header = F) 
eigenval <- read.table("BeeMuse_filtered.eigenval", header = F) 

colnames(eigenvec)[colnames(eigenvec) == "V1"] <- "Sample"
colnames(eigenvec)[colnames(eigenvec) == "V2"] <- "Filename"

Corres_ID_E756 <- read.table("/home/nbettembourg/Documents/Stage_NB/data/BeeMuSe_Wageningen_E756_17_03_2023_Alain/BeeMuSe_Wageningen_E756_17_03_2023_Alain/Corres_ID_E756.txt",header=F)
input_pedigree_BeeMuSe <- read.table("/home/nbettembourg/Documents/Stage_NB/data/BeeMuSe_Wageningen_E756_17_03_2023_Alain/BeeMuSe_Wageningen_E756_17_03_2023_Alain/input-pedigree_BeeMuSe.csv",header=T,sep=";")
# Correspondance pedigree - échantillons BeeMuSe
# extraire 'Pool' et '-100' adin d'obtenir 'Pool-100'
eigenvec$name <- paste(sub("Beemuse_", "", eigenvec$Sample), 
                 sub("_(.*?)\\..*", "\\1", eigenvec$Filename), 
                 sep = "-")
eigenvec$name <- str_extract(eigenvec$name, "[A-Za-z0-9]+-[0-9]+")

colnames(Corres_ID_E756)[colnames(Corres_ID_E756) == "V1"] <- "name"
colnames(Corres_ID_E756)[colnames(Corres_ID_E756) == "V2"] <- "ID_1a"

Corres_ID_E756$ID_1a <- gsub("o", "_", Corres_ID_E756$ID_1a)

# Remplacement de tous les "o" par "_" sauf quand présent dans un mot
Corres_ID_E756$ID_1a <- gsub("Pers_", "Perso", Corres_ID_E756$ID_1a)
Corres_ID_E756$ID_1a <- gsub("L_c", "Loc", Corres_ID_E756$ID_1a)

Corres_ID_E756_eigenvec <- merge(eigenvec, Corres_ID_E756,  by = 'name')
merged_3 <- merge(Corres_ID_E756_eigenvec, input_pedigree_BeeMuSe,  by = 'ID_1a')
colnames(merged_3)[colnames(merged_3) == "V3.x"] <- "V3"

ind2pop_ID_2a = subset(merged_3, select = c(name, ID_2a))

id_counts <- as.data.frame(table(merged_3$ID_2a))
names(id_counts) <- c("ID_2a", "Occurrence")
# Importation des données de matrice d'apparentement dans R et visualisation par clustering hiérarchique
matrice_app <- read.table("plink2.rel", header = FALSE)
dist_matrice <- dist(matrice_app)
hc <- hclust(dist_matrice, method = "ward.D2")
plot(hc)

#heatplot(as.matrix(dist(matrice_app,diag=T)), cols.default = F, lowcol = 'blue', highcol='yellow', dualScale = F, scale='none', method='ward.D2')
# Proportion de la variance expliquée
eigen_percent <- round((eigenval / (sum(eigenval) )*100),2)

lambda <- eigenval$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)

PC1/PC2

  # ACP - Sample : plaque de provenance de génotypage des échantillons
ggplot(eigenvec, aes(x = V3, y = V4, label = Sample, color = Sample)) +
  geom_point() +
  labs(title = "PCA Plot", x = "PC1", y = "PC2") +
  theme(legend.position = "right")  

  # ACP - ID_breeder : identification de l'apiculteur de chez qui proviennent les abeilles
ggplot(merged_3, aes(x = V3, y = V4, label = ID_breeder, color = ID_breeder)) +
  geom_point() +
  labs(title = "PCA Plot", x = "PC1", y = "PC2") +
  theme(legend.position = "right") 

  # ACP - pedigree : ID_2a (même reine mère des reines génotypées)
# I - même couleur et forme des points pour les 29 groupes

ggplot(merged_3, aes(x = V3, y = V4, label = ID_2a)) +
  geom_point(size = 1, color = "red") + 
  labs(title = "PCA Plot", x = "PC1", y = "PC2") +
  theme(legend.position = "right") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = -0.1, xmax = -0.16, ymin = 0.05, ymax = -0.01)

# II - couleurs différentes et même forme de points pour les 29 groupes

ggplot(merged_3, aes(x = V3, y = V4, label = ID_2a, color = ID_2a)) +
  geom_point(size = 1) +  
  scale_color_manual(values = c("#FF0000FF", "#FF3300FF", "#FF6600FF", "#FF9900FF", "#FFCC00FF", 
                                 "#FFFF00FF", "#CCFF00FF", "#99FF00FF", "#66FF00FF", "#33FF00FF", 
                                 "#00FF00FF", "#00FF33FF", "#00FF66FF", "#00FF99FF", "#00FFCCFF", 
                                 "#00FFFFFF", "#00CCFFFF", "#0099FFFF", "#0066FFFF", "#0033FFFF", 
                                 "#0000FFFF", "#3300FFFF", "#6600FFFF", "#9900FFFF", "#CC00FFFF", 
                                 "#FF00FFFF", "#FF00CCFF", "#FF0099FF", "#FF0066FF")) +
  labs(title = "PCA Plot", x = "PC1", y = "PC2") +
  theme(legend.position = "right") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = -0.1, xmax = -0.16, ymin = 0.05, ymax = -0.01)

# III - couleurs et formes différentes, pour mieux distinguer les 29 groupes

values_color <- c("#FF0000FF", "#FF3300FF", "#FF6600FF", "#FF9900FF", "#FFCC00FF", 
                  "#FFFF00FF", "#CCFF00FF", "#99FF00FF", "#66FF00FF", "#33FF00FF", 
                  "#00FF00FF", "#00FF33FF", "#00FF66FF", "#00FF99FF", "#00FFCCFF", 
                  "#00FFFFFF", "#00CCFFFF", "#0099FFFF", "#0066FFFF", "#0033FFFF", 
                  "#0000FFFF", "#3300FFFF", "#6600FFFF", "#9900FFFF", "#CC00FFFF", 
                  "#FF00FFFF", "#FF00CCFF", "#FF0099FF", "#FF0066FF")

values_shape <- c(16, 15, 17, 18, 0, 1, 2, 6, 5, 3, 4, 9, 8, 16, 15, 17, 18, 0, 1, 2, 6, 5, 3, 4, 9, 8, 16, 15, 17)

ggplot(merged_3, aes(x = V3, y = V4, label = ID_2a, color = ID_2a, shape = ID_2a)) +
  geom_point(size = 1.5) +
  scale_color_manual(values = values_color) +
  scale_shape_manual(values = values_shape) +
  labs(title = "PCA Plot", x = "PC1", y = "PC2") +
  theme(legend.position = "right") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = -0.1, xmax = -0.16, ymin = 0.05, ymax = -0.01)

# Visualisation ACP PC1 PC2 avec ellipses de seuil de confiance de 0.97
ggplot(merged_3, aes(x = V3, y = V4, label = ID_2a, color = ID_2a)) +
  geom_point(size = 0.8) +  
  stat_ellipse(aes(group = ID_2a), geom = "polygon", level = 0.97, alpha = 0, size = 0.1, color = "black") +
  labs(title = "PCA Plot", x = "PC1", y = "PC2") +
  theme(legend.position = "right") 
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

# 15 familles bien regroupés entre elles
ggplot(subset(merged_3, ID_2a %in% c("BAH_20-19", "BBS_6-19","BER_11-19", "BLS_53-19","BHA_2-20",  "KLSU_14-19", "MM_31-20", "MP_10-20", "PersoLD_2021", "PersoLD_2022", "S_GZ_2-19", "SJ_16-20", "SJ_24-20", "TL_13-20", "TL_19-20")), aes(x = V3, y = V4, color = ID_2a)) +
  geom_point(size = 1) +
  stat_ellipse(aes(group = ID_2a, fill = ID_2a), geom = "polygon", level = 0.97, alpha = 0.15, size = 0.15, color = "black") +
  #stat_ellipse(aes(group = ID_2a), geom = "polygon", level = 0.97, alpha = 0, size = 0.1, color = "black") +
  labs(title = "PCA Plot", x = "PC1", y = "PC2") +
  theme(legend.position = "right")
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

# 14 familles moins bien regroupées entre elles
ggplot(subset(merged_3, ID_2a %in% c( "BH_44","BH_7-19", "ER_13-19","MM_37-20", "KBJ_1-19", "KLoc_37-19","KBru_6-20", "PersoBC_2021", "PersoJLL_2021", "PersoJLL_2022", "PersoUB_2021", "PersoUB_2022", "SJ_30-20","SBJ_3-19")), aes(x = V3, y = V4, color = ID_2a)) +
  geom_point(size = 1) +
  stat_ellipse(aes(group = ID_2a, fill = ID_2a), geom = "polygon", level = 0.97, alpha = 0.15, size = 0.15, color = "black") +
  #stat_ellipse(aes(group = ID_2a), geom = "polygon", level = 0.97, alpha = 0, size = 0.1, color = "black") +
  labs(title = "PCA Plot", x = "PC1", y = "PC2") +
  theme(legend.position = "right")

  # ACP - Sample (plaques)
ggplot(merged_3, aes(x = V3, y = V4, label = Sample, color = Sample)) +
  geom_point() +
 # stat_ellipse(aes(group = Sample), geom = "polygon", level = 0.97, alpha = 0, size = 0.2, color = "black") +
  # stat_ellipse(aes(group = Sample, fill = Sample), geom = "polygon", level = 0.97, alpha = 0.2, size = 0.2, color = "black") +
  labs(title = "PCA Plot", x = "PC1", y = "PC2") +
  theme(legend.position = "right") 

# 748 -> 681 -> 612 échantillons
values_in_A_not_in_B <- eigenvec$name[!(eigenvec$name %in% merged_3$name)]
length(values_in_A_not_in_B) 
## [1] 136

PC3/PC4

 # ACP - ID_2a : reines mères des reines génotypées
# I - même couleur et forme pour les 29 groupes

ggplot(merged_3, aes(x = V5, y = V6, label = ID_2a)) +
  geom_point(size = 1, color = "red") +  
  labs(title = "PCA Plot - BeeMuSe", x = "PC3", y = "PC4") +
  theme(legend.position = "right") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = -0.055, xmax = -0.09, ymin = -0.04, ymax = -0.1)

# II - couleurs différentes et même forme pour les 29 groupes

ggplot(merged_3, aes(x = V5, y = V6, label = ID_2a, color = ID_2a)) +
  geom_point(size = 1) + 
  scale_color_manual(values = c("#FF0000FF", "#FF3300FF", "#FF6600FF", "#FF9900FF", "#FFCC00FF", 
                                 "#FFFF00FF", "#CCFF00FF", "#99FF00FF", "#66FF00FF", "#33FF00FF", 
                                 "#00FF00FF", "#00FF33FF", "#00FF66FF", "#00FF99FF", "#00FFCCFF", 
                                 "#00FFFFFF", "#00CCFFFF", "#0099FFFF", "#0066FFFF", "#0033FFFF", 
                                 "#0000FFFF", "#3300FFFF", "#6600FFFF", "#9900FFFF", "#CC00FFFF", 
                                 "#FF00FFFF", "#FF00CCFF", "#FF0099FF", "#FF0066FF")) +
  labs(title = "PCA Plot - BeeMuSe", x = "PC3", y = "PC4") +
  theme(legend.position = "right") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = -0.055, xmax = -0.09, ymin = -0.04, ymax = -0.1)

# 15 bien regroupés
ggplot(subset(merged_3, ID_2a %in% c("BAH_20-19", "BBS_6-19","BER_11-19", "BLS_53-19","BHA_2-20",  "KLSU_14-19", "MM_31-20", "MP_10-20", "PersoLD_2021", "PersoLD_2022", "S_GZ_2-19", "SJ_16-20", "SJ_24-20", "TL_13-20", "TL_19-20")), aes(x = V5, y = V6, color = ID_2a)) +
  geom_point(size = 1) +
  stat_ellipse(aes(group = ID_2a, fill = ID_2a), geom = "polygon", level = 0.97, alpha = 0.15, size = 0.15, color = "black") +
  #stat_ellipse(aes(group = ID_2a), geom = "polygon", level = 0.97, alpha = 0, size = 0.1, color = "black") +
  labs(title = "PCA Plot - BeeMuSe", x = "PC3", y = "PC4") +
  theme(legend.position = "right")
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

# 14 moins bien regroupées entre elles
ggplot(subset(merged_3, ID_2a %in% c( "BH_44","BH_7-19", "ER_13-19","MM_37-20", "KBJ_1-19", "KLoc_37-19","KBru_6-20", "PersoBC_2021", "PersoJLL_2021", "PersoJLL_2022", "PersoUB_2021", "PersoUB_2022", "SJ_30-20","SBJ_3-19")), aes(x = V5, y = V6, color = ID_2a)) +
  geom_point(size = 1) +
 stat_ellipse(aes(group = ID_2a, fill = ID_2a), geom = "polygon", level = 0.97, alpha = 0.15, size = 0.15, color = "black") +
  #stat_ellipse(aes(group = ID_2a), geom = "polygon", level = 0.97, alpha = 0, size = 0.1, color = "black") +
  labs(title = "PCA Plot - BeeMuSe", x = "PC3", y = "PC4") +
  theme(legend.position = "right")
## Warning in MASS::cov.trob(data[, vars]): Probable convergence failure

# III - couleurs et formes différentes, pour mieux distinguer les 29 groupes

values_color <- c("#FF0000FF", "#FF3300FF", "#FF6600FF", "#FF9900FF", "#FFCC00FF", 
                  "#FFFF00FF", "#CCFF00FF", "#99FF00FF", "#66FF00FF", "#33FF00FF", 
                  "#00FF00FF", "#00FF33FF", "#00FF66FF", "#00FF99FF", "#00FFCCFF", 
                  "#00FFFFFF", "#00CCFFFF", "#0099FFFF", "#0066FFFF", "#0033FFFF", 
                  "#0000FFFF", "#3300FFFF", "#6600FFFF", "#9900FFFF", "#CC00FFFF", 
                  "#FF00FFFF", "#FF00CCFF", "#FF0099FF", "#FF0066FF")

values_shape <- c(16, 15, 17, 18, 0, 1, 2, 6, 5, 3, 4, 9, 8, 16, 15, 17, 18, 0, 1, 2, 6, 5, 3, 4, 9, 8, 16, 15, 17)

ggplot(merged_3, aes(x = V5, y = V6, label = ID_2a, color = ID_2a, shape = ID_2a)) +
  geom_point(size = 1.5) +
  scale_color_manual(values = values_color) +
  scale_shape_manual(values = values_shape) +
  labs(title = "PCA Plot", x = "PC3", y = "PC4") +
  theme(legend.position = "right") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = -0.055, xmax = -0.09, ymin = -0.04, ymax = -0.1)

Correspondance ID_2a - fichier .fam

  #  BEEMUSE - ID_2a
beemuse_samples_ID_2a <- merged_3[, c("Sample", "Filename", "ID_2a")]
nom_fichier <- "beemuse_samples_ID_2a.txt"
write.table(beemuse_samples_ID_2a, file = nom_fichier, sep = "\t", row.names = FALSE,quote=FALSE)
  # FAM MERGED SEQAPIPOP BEEMUSE 561
merged_fam <- read.table("merged_BeeMuSe_SeqApiPop_561_filtered_maf001_LD_default.fam", sep = " ", stringsAsFactors = FALSE)

merged_fam <- merged_fam[, 1:2]
colnames(merged_fam)[colnames(merged_fam) == "V1"] <- "Sample"
colnames(merged_fam)[colnames(merged_fam) == "V2"] <- "Filename"
# Créer une nouvelle colonne dans le fichier 2 pour stocker les valeurs de la troisième colonne
merged_fam$ID_2a <- NA

# Parcourir les lignes du fichier 2
for (i in 1:nrow(merged_fam)) {
    # Vérifier si les valeurs des colonnes Sample et Filename du fichier 2 sont présentes dans le fichier 1
    correspondance <- beemuse_samples_ID_2a$ID_2a[beemuse_samples_ID_2a$Sample == merged_fam$Sample[i] & beemuse_samples_ID_2a$Filename == merged_fam$Filename[i]]
    
    # Si une correspondance exacte est trouvée, ajouter la valeur correspondante de la colonne "ID_2a" du fichier 1 au fichier 2
    if (length(correspondance) > 0) {
        merged_fam$ID_2a[i] <- as.character(correspondance[1]) # Utilisez la correspondance trouvée
    } else {
        # Sinon, ajouter "Beemuse" dans la troisième colonne du fichier 2
        merged_fam$ID_2a[i] <- "Beemuse"
    }
}

# Créer fichier de correspondance entre les échantillons de BeeMuSe ID_2a avec le fichier .fam
correspondance <- merged_fam[, c("Sample","Filename","ID_2a")]
nom_fichier <- "correspondance_samples.txt"
write.table(correspondance, file = nom_fichier, sep = "\t", row.names = FALSE,quote=FALSE)

Merged Data

748 échantillons BeeMuSe - 561 échantillons SeqApiPop

MAF > 0.01 - LD pruning = 0.1 (fenêtre de 50 SNPs et pas de 10 bp) - 1055 SNPs

setwd("~/Documents/Stage_NB/data/merged_data_1055_561_not_supervised")

eigenvec_merged_maf001_LD03 <- read.table("merged_BeeMuSe_SeqApiPop_561_filtered_maf001_LD_default_acp.eigenvec", header = F) 
eigenval_merged_maf001_LD03 <- read.table("merged_BeeMuSe_SeqApiPop_561_filtered_maf001_LD_default_acp.eigenval", header = F) 

colnames(eigenvec_merged_maf001_LD03)[colnames(eigenvec_merged_maf001_LD03) == "V1"] <- "Sample"
colnames(eigenvec_merged_maf001_LD03)[colnames(eigenvec_merged_maf001_LD03) == "V2"] <- "name"

Corres_ID_E756 <- read.table("/home/nbettembourg/Documents/Stage_NB/data/BeeMuSe_Wageningen_E756_17_03_2023_Alain/BeeMuSe_Wageningen_E756_17_03_2023_Alain/Corres_ID_E756.txt",header=F)
input_pedigree_BeeMuSe <- read.table("/home/nbettembourg/Documents/Stage_NB/data/BeeMuSe_Wageningen_E756_17_03_2023_Alain/BeeMuSe_Wageningen_E756_17_03_2023_Alain/input-pedigree_BeeMuSe.csv",header=T,sep=";")

seq_api_labels <- read.csv("~/Documents/Stage_NB/data/SeqApiPop_labels.csv")

eigenvec_merged_maf001_LD03_seq_api_labels <- merge(eigenvec_merged_maf001_LD03, seq_api_labels, by = "name")

eigen_percent <- round((eigenval_merged_maf001_LD03 / (sum(eigenval_merged_maf001_LD03) )*100),2)
lambda <- eigenval_merged_maf001_LD03$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)
# Extraction des 301 individus des populations de référence
eigenvec_merged_maf001_LD03_seq_api_labels <- eigenvec_merged_maf001_LD03_seq_api_labels[eigenvec_merged_maf001_LD03_seq_api_labels$GeneticOrigin != 'Unknown' &
                                                                   eigenvec_merged_maf001_LD03_seq_api_labels$Label != 'Ariege Conservatory' &
                                                                   eigenvec_merged_maf001_LD03_seq_api_labels$Label != 'Brittany Conservatory' &
                                                                   eigenvec_merged_maf001_LD03_seq_api_labels$UniqueInHive != 'Unknown' &
                                                                   eigenvec_merged_maf001_LD03_seq_api_labels$UniqueInHive != 'Buckfast' &
                                                                   eigenvec_merged_maf001_LD03_seq_api_labels$GeneticOrigin != 'Buckfast', ]

# Filtrer les lignes où les facteurs des colonnes Sample et name sont différents
eigenvec_BeeMuSe_748_Samples <- eigenvec_merged_maf001_LD03[as.character(eigenvec_merged_maf001_LD03$Sample) != as.character(eigenvec_merged_maf001_LD03$name), ]
# extraire 'Pool' et '-100' obtenir 'Pool-100'
eigenvec_merged_maf001_LD03$name <- paste(sub("Beemuse_", "", eigenvec_merged_maf001_LD03$Sample), 
                 sub("_(.*?)\\..*", "\\1", eigenvec_merged_maf001_LD03$name), 
                 sep = "-")
eigenvec_merged_maf001_LD03$name <- str_extract(eigenvec_merged_maf001_LD03$name, "[A-Za-z0-9]+-[0-9]+")

colnames(Corres_ID_E756)[colnames(Corres_ID_E756) == "V1"] <- "name"
colnames(Corres_ID_E756)[colnames(Corres_ID_E756) == "V2"] <- "ID_1a"

Corres_ID_E756$ID_1a <- gsub("o", "_", Corres_ID_E756$ID_1a)

# Remplacement de "o" par "_" sauf quand présent dans un mot
Corres_ID_E756$ID_1a <- gsub("Pers_", "Perso", Corres_ID_E756$ID_1a)
Corres_ID_E756$ID_1a <- gsub("L_c", "Loc", Corres_ID_E756$ID_1a)

Corres_ID_E756_eigenvec <- merge(eigenvec_merged_maf001_LD03, Corres_ID_E756,  by = 'name')
eigenvec_merged_Corres_ID_E756_pedigree <- merge(Corres_ID_E756_eigenvec, input_pedigree_BeeMuSe,  by = 'ID_1a')
colnames(eigenvec_merged_Corres_ID_E756_pedigree)[colnames(eigenvec_merged_Corres_ID_E756_pedigree) == "V3.x"] <- "V3"
PC1/PC2
# ACP - merged data
lambda <- eigenval_merged_maf001_LD03$V1
variance_proportion <- lambda / sum(lambda)
variance_df <- data.frame(PC = seq_along(variance_proportion), Variance = variance_proportion)

ggplot(eigenvec_merged_maf001_LD03, aes(x = V3, y = V4)) +
  geom_point() +
  scale_x_reverse() + 
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop", x = "PC1", y = "PC2") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.065, xmax = 0.04, ymin = -0.1, ymax = -0.05)

# ACP plot - BeeMuSe 748 & SeqApiPop 301 RefPop
# ACP SeqApiPop
plot1 <- ggplot(eigenvec_merged_maf001_LD03_seq_api_labels, aes(x = V3, y = V4, color = "SeqApiPop")) +
  geom_point() +
  scale_x_reverse() 

# Ajout ACP pour BeeMuSe
plot2 <- plot1 + geom_point(data = eigenvec_BeeMuSe_748_Samples, aes(x = V3, y = V4, color = "BeeMuSe")) +
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop",x = "PC1", y = "PC2")

# Ajout de la légende
plot2 + scale_color_manual(name = "Dataset", 
                            values = c(SeqApiPop = "black", BeeMuSe = "red"),
                            labels = c(SeqApiPop = "SeqApiPop 301 RefPop", BeeMuSe = "BeeMuSe 748 Samples")) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.075, xmax = 0.04, ymin = -0.1, ymax = -0.05)

# ACP plot - BeeMuSe 612 Pedigree & SeqApiPop 301 RefPop
# ACP SeqApiPop
plot1 <- ggplot(eigenvec_merged_maf001_LD03_seq_api_labels, aes(x = V3, y = V4, color = "SeqApiPop")) +
  geom_point() +
  scale_x_reverse()

# Ajout ACP pour BeeMuSe
plot2 <- plot1 + geom_point(data = eigenvec_merged_Corres_ID_E756_pedigree, aes(x = V3, y = V4, color = "BeeMuSe")) +
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop",x = "PC1", y = "PC2")

# Ajout de la légende
plot2 + scale_color_manual(name = "Dataset", 
                            values = c(SeqApiPop = "black", BeeMuSe = "red"),
                            labels = c(SeqApiPop = "SeqApiPop 301 RefPop", BeeMuSe = "BeeMuSe 612 Pedigree")) +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.075, xmax = 0.04, ymin = -0.1, ymax = -0.05)

# Couleurs 3 lignées - Label
custom_colors_2 <- c( "black", "black", "black", "black", "orange", "orange", "orange", "orange", "orange", "orange", "mediumseagreen", "red")

# ACP SeqApiPop
plot1 <- ggplot(eigenvec_merged_maf001_LD03_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point(size = 1) +
  scale_x_reverse() +
  #scale_y_reverse() +
  scale_color_manual(values = custom_colors_2, 
                     breaks = c("Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c("Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) 

# Ajout ACP pour BeeMuSe
plot1 + geom_point(data = eigenvec_BeeMuSe_748_Samples, aes(x = V3, y = V4), color = "red", size =1) +
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop", x = "PC1", y = "PC2") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.075, xmax = 0.04, ymin = -0.1, ymax = -0.05)

# Couleurs des 11 populations de référence - Label
custom_colors <- c( "mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen", "red")

# ACP SeqApiPop
plot1 <- ggplot(eigenvec_merged_maf001_LD03_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point(size=1) +
  scale_x_reverse() +
  scale_color_manual(values = custom_colors, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c( "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) 

# Ajout ACP pour BeeMuSe
plot1 + geom_point(data = eigenvec_BeeMuSe_748_Samples, aes(x = V3, y = V4), color = "red", size=1) +
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop", x = "PC1", y = "PC2") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.075, xmax = 0.04, ymin = -0.1, ymax = -0.05)

# SeqApiPop - 301 Samples - BeeMuSe 612 Samples - ID_2a - colors
custom_colors <- c( "black", "black", "black", 
                   "black", "black", "black", "black", "black", "black", "black", 
                   "black", 
                   "#FF0000FF", "#FF3300FF", "#FF6600FF", "#FF9900FF", "#FFCC00FF", 
                   "#FFFF00FF", "#CCFF00FF", "#99FF00FF", "#66FF00FF", "#33FF00FF", 
                   "#00FF00FF", "#00FF33FF", "#00FF66FF", "#00FF99FF", "#00FFCCFF", 
                   "#00FFFFFF", "#00CCFFFF", "#0099FFFF", "#0066FFFF", "#0033FFFF", 
                   "#0000FFFF", "#3300FFFF", "#6600FFFF", "#9900FFFF", "#CC00FFFF", 
                   "#FF00FFFF", "#FF00CCFF", "#FF0099FF", "#FF0066FF")

# ACP SeqApiPop
plot1 <- ggplot(eigenvec_merged_maf001_LD03_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point(size=1) +
  scale_x_reverse() +
  scale_color_manual(values = custom_colors, 
                     breaks = c( "Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France", "BAH_20-19", "BBS_6-19", "BER_11-19", "BH_44", "BH_7-19", "BHA_2-20", "BLS_53-19", "ER_13-19", "KBJ_1-19", "KBru_6-20", "KLoc_37-19", "KLSU_14-19", "MM_31-20", "MM_37-20", "MP_10-20", 
"PersoBC_2021", "PersoJLL_2021", "PersoJLL_2022", "PersoLD_2021", "PersoLD_2022", "PersoUB_2021", 
"PersoUB_2022", "S_GZ_2-19", "SBJ_3-19", "SJ_16-20", "SJ_24-20", "SJ_30-20", "TL_13-20", 
"TL_19-20"),
                     labels = c( "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia","BAH_20-19", "BBS_6-19", "BER_11-19", "BH_44", "BH_7-19", "BHA_2-20", "BLS_53-19", "ER_13-19", 
 "KBJ_1-19","KBru_6-20", "KLoc_37-19", "KLSU_14-19", "MM_31-20", "MM_37-20", "MP_10-20", 
"PersoBC_2021", "PersoJLL_2021", "PersoJLL_2022", "PersoLD_2021", "PersoLD_2022", "PersoUB_2021", 
"PersoUB_2022", "S_GZ_2-19", "SBJ_3-19", "SJ_16-20", "SJ_24-20", "SJ_30-20", "TL_13-20", 
"TL_19-20")) +  
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop", x = "PC1", y = "PC2") +
  theme(legend.position = "right")

# Ajout ACP BeeMuSe
plot1 + geom_point(data = eigenvec_merged_Corres_ID_E756_pedigree, aes(x = V3, y = V4, color = ID_2a),size=1) +
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop", x = "PC1", y = "PC2") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.075, xmax = 0.04, ymin = -0.1, ymax = -0.05)

# SeqApiPop - 301 Samples - Label - colors - 3 lignées
custom_colors <- c( "black", "black", "black", 
                   "black", "orange", "orange", "orange", "orange", "orange", "orange", 
                   "mediumseagreen", 
                   "#FF0000FF", "#FF3300FF", "#FF6600FF", "#FF9900FF", "#FFCC00FF", 
                   "#FFFF00FF", "#CCFF00FF", "#99FF00FF", "#66FF00FF", "#33FF00FF", 
                   "#00FF00FF", "#00FF33FF", "#00FF66FF", "#00FF99FF", "#00FFCCFF", 
                   "#00FFFFFF", "#00CCFFFF", "#0099FFFF", "#0066FFFF", "#0033FFFF", 
                   "#0000FFFF", "#3300FFFF", "#6600FFFF", "#9900FFFF", "#CC00FFFF", 
                   "#FF00FFFF", "#FF00CCFF", "#FF0099FF", "#FF0066FF")

# ACP SeqApiPop
plot1 <- ggplot(eigenvec_merged_maf001_LD03_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point(size=1) +
  scale_x_reverse() +
  scale_color_manual(values = custom_colors, 
                     breaks = c("Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France", "BAH_20-19", "BBS_6-19", "BER_11-19", "BH_44", "BH_7-19", "BHA_2-20", "BLS_53-19", "ER_13-19", "KBJ_1-19", "KBru_6-20", "KLoc_37-19", "KLSU_14-19", "MM_31-20", "MM_37-20", "MP_10-20", 
"PersoBC_2021", "PersoJLL_2021", "PersoJLL_2022", "PersoLD_2021", "PersoLD_2022", "PersoUB_2021", 
"PersoUB_2022", "S_GZ_2-19", "SBJ_3-19", "SJ_16-20", "SJ_24-20", "SJ_30-20", "TL_13-20", 
"TL_19-20"),
                     labels = c( "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia","BAH_20-19", "BBS_6-19", "BER_11-19", "BH_44", "BH_7-19", "BHA_2-20", "BLS_53-19", "ER_13-19", 
 "KBJ_1-19","KBru_6-20", "KLoc_37-19", "KLSU_14-19", "MM_31-20", "MM_37-20", "MP_10-20", 
"PersoBC_2021", "PersoJLL_2021", "PersoJLL_2022", "PersoLD_2021", "PersoLD_2022", "PersoUB_2021", 
"PersoUB_2022", "S_GZ_2-19", "SBJ_3-19", "SJ_16-20", "SJ_24-20", "SJ_30-20", "TL_13-20", 
"TL_19-20")) +  
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop", x = "PC1", y = "PC2") +
  theme(legend.position = "right")

# Ajout ACP BeeMuSe
plot1 + geom_point(data = eigenvec_merged_Corres_ID_E756_pedigree, aes(x = V3, y = V4, color = ID_2a),size=1) +
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop", x = "PC1", y = "PC2") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.075, xmax = 0.04, ymin = -0.1, ymax = -0.05)

# SeqApiPop - 301 - Label - couleurs - 11 populations de référence
custom_colors <- c("mediumpurple3", "lightskyblue", "mediumvioletred", 
                   "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", 
                   "mediumseagreen", 
                   "#FF0000FF", "#FF3300FF", "#FF6600FF", "#FF9900FF", "#FFCC00FF", 
                   "#FFFF00FF", "#CCFF00FF", "#99FF00FF", "#66FF00FF", "#33FF00FF", 
                   "#00FF00FF", "#00FF33FF", "#00FF66FF", "#00FF99FF", "#00FFCCFF", 
                   "#00FFFFFF", "#00CCFFFF", "#0099FFFF", "#0066FFFF", "#0033FFFF", 
                   "#0000FFFF", "#3300FFFF", "#6600FFFF", "#9900FFFF", "#CC00FFFF", 
                   "#FF00FFFF", "#FF00CCFF", "#FF0099FF", "#FF0066FF")

# ACP SeqApiPop
plot1 <- ggplot(eigenvec_merged_maf001_LD03_seq_api_labels, aes(x = V3, y = V4, color = Label)) +
  geom_point(size = 1) +  
  scale_x_reverse() +
  scale_color_manual(values = custom_colors, 
                     breaks = c("Iberiensis Spain", "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France", "BAH_20-19", "BBS_6-19", "BER_11-19", "BH_44", "BH_7-19", "BHA_2-20", "BLS_53-19", "ER_13-19", "KBJ_1-19", "KBru_6-20", "KLoc_37-19", "KLSU_14-19", "MM_31-20", "MM_37-20", "MP_10-20", 
"PersoBC_2021", "PersoJLL_2021", "PersoJLL_2022", "PersoLD_2021", "PersoLD_2022", "PersoUB_2021", 
"PersoUB_2022", "S_GZ_2-19", "SBJ_3-19", "SJ_16-20", "SJ_24-20", "SJ_30-20", "TL_13-20", 
"TL_19-20"),
                     labels = c("Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia","BAH_20-19", "BBS_6-19", "BER_11-19", "BH_44", "BH_7-19", "BHA_2-20", "BLS_53-19", "ER_13-19", 
 "KBJ_1-19","KBru_6-20", "KLoc_37-19", "KLSU_14-19", "MM_31-20", "MM_37-20", "MP_10-20", 
"PersoBC_2021", "PersoJLL_2021", "PersoJLL_2022", "PersoLD_2021", "PersoLD_2022", "PersoUB_2021", 
"PersoUB_2022", "S_GZ_2-19", "SBJ_3-19", "SJ_16-20", "SJ_24-20", "SJ_30-20", "TL_13-20", 
"TL_19-20")) +  
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop", x = "PC1", y = "PC2") +
  theme(legend.position = "right")

# Ajout ACP BeeMuSe
plot1 + geom_point(data = eigenvec_merged_Corres_ID_E756_pedigree, aes(x = V3, y = V4, color = ID_2a), size = 1) + 
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop", x = "PC1", y = "PC2") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.075, xmax = 0.04, ymin = -0.1, ymax = -0.05)

PC3/PC4
# ACP - PC3 / PC4

# ACP SeqApiPop
plot1 <- ggplot(eigenvec_merged_maf001_LD03_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point(size = 1) + 
  scale_x_reverse() +
  scale_color_manual(values = custom_colors_2, 
                     breaks = c("Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c( "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) 

# Ajout ACP BeeMuSe
plot1 + geom_point(data = eigenvec_BeeMuSe_748_Samples, aes(x = V5, y = V6), color = "red", size = 1) + 
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop", x = "PC3", y = "PC4") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.097, xmax = 0.034, ymin = -0.1, ymax = -0.05)

# Couleurs 11 populations de référence
custom_colors <- c( "mediumpurple3", "lightskyblue", "mediumvioletred", "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", "mediumseagreen", "red")

# ACP SeqApiPop
plot1 <- ggplot(eigenvec_merged_maf001_LD03_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point(size = 1) +
  scale_x_reverse() +
  scale_color_manual(values = custom_colors, 
                     breaks = c("Iberiensis Spain", 
                                "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France"),
                     labels = c( "Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia")) 

# Ajout ACP BeeMuSe
plot1 + geom_point(data = eigenvec_BeeMuSe_748_Samples, aes(x = V5, y = V6), color = "red", size = 1) +
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop", x = "PC3", y = "PC4") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.097, xmax = 0.034, ymin = -0.1, ymax = -0.05)

# Couleurs des 11 populations de référence SeqApiPop + 29 familles ID_2a 
custom_colors <- c("mediumpurple3", "lightskyblue", "mediumvioletred", 
                   "hotpink1", "yellow", "gold", "orange", "chocolate", "brown", "olivedrab3", 
                   "mediumseagreen", 
                   "#FF0000FF", "#FF3300FF", "#FF6600FF", "#FF9900FF", "#FFCC00FF", 
                   "#FFFF00FF", "#CCFF00FF", "#99FF00FF", "#66FF00FF", "#33FF00FF", 
                   "#00FF00FF", "#00FF33FF", "#00FF66FF", "#00FF99FF", "#00FFCCFF", 
                   "#00FFFFFF", "#00CCFFFF", "#0099FFFF", "#0066FFFF", "#0033FFFF", 
                   "#0000FFFF", "#3300FFFF", "#6600FFFF", "#9900FFFF", "#CC00FFFF", 
                   "#FF00FFFF", "#FF00CCFF", "#FF0099FF", "#FF0066FF")

# ACP SeqApiPop
plot1 <- ggplot(eigenvec_merged_maf001_LD03_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point(size = 1) +  
  scale_x_reverse() +
  scale_color_manual(values = custom_colors, 
                     breaks = c("Iberiensis Spain", "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France", "BAH_20-19", "BBS_6-19", "BER_11-19", "BH_44", "BH_7-19", "BHA_2-20", "BLS_53-19", "ER_13-19", "KBJ_1-19", "KBru_6-20", "KLoc_37-19", "KLSU_14-19", "MM_31-20", "MM_37-20", "MP_10-20", 
"PersoBC_2021", "PersoJLL_2021", "PersoJLL_2022", "PersoLD_2021", "PersoLD_2022", "PersoUB_2021", 
"PersoUB_2022", "S_GZ_2-19", "SBJ_3-19", "SJ_16-20", "SJ_24-20", "SJ_30-20", "TL_13-20", 
"TL_19-20"),
                     labels = c("Iberiensis Spain", 
                                "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia","BAH_20-19", "BBS_6-19", "BER_11-19", "BH_44", "BH_7-19", "BHA_2-20", "BLS_53-19", "ER_13-19", 
 "KBJ_1-19","KBru_6-20", "KLoc_37-19", "KLSU_14-19", "MM_31-20", "MM_37-20", "MP_10-20", 
"PersoBC_2021", "PersoJLL_2021", "PersoJLL_2022", "PersoLD_2021", "PersoLD_2022", "PersoUB_2021", 
"PersoUB_2022", "S_GZ_2-19", "SBJ_3-19", "SJ_16-20", "SJ_24-20", "SJ_30-20", "TL_13-20", 
"TL_19-20")) +  
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop", x = "PC3", y = "PC4") +
  theme(legend.position = "right")

# Ajouter ACP BeeMuSe
plot1 + geom_point(data = eigenvec_merged_Corres_ID_E756_pedigree, aes(x = V5, y = V6, color = ID_2a), size = 1) + 
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop", x = "PC3", y = "PC4") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.097, xmax = 0.034, ymin = -0.1, ymax = -0.05)

# Couleurs 3 lignées SeqApiPop + 29 familles ID_2a 
custom_colors <- c( "black", "black", "black", 
                   "black", "orange", "orange", "orange", "orange", "orange", "orange", 
                   "mediumseagreen", 
                   "#FF0000FF", "#FF3300FF", "#FF6600FF", "#FF9900FF", "#FFCC00FF", 
                   "#FFFF00FF", "#CCFF00FF", "#99FF00FF", "#66FF00FF", "#33FF00FF", 
                   "#00FF00FF", "#00FF33FF", "#00FF66FF", "#00FF99FF", "#00FFCCFF", 
                   "#00FFFFFF", "#00CCFFFF", "#0099FFFF", "#0066FFFF", "#0033FFFF", 
                   "#0000FFFF", "#3300FFFF", "#6600FFFF", "#9900FFFF", "#CC00FFFF", 
                   "#FF00FFFF", "#FF00CCFF", "#FF0099FF", "#FF0066FF")

# ACP SeqApiPop
plot1 <- ggplot(eigenvec_merged_maf001_LD03_seq_api_labels, aes(x = V5, y = V6, color = Label)) +
  geom_point(size = 1) +
  scale_x_reverse() +
  scale_color_manual(values = custom_colors, 
                     breaks = c( "Iberiensis Spain", "Savoy Conservatory", "Porquerolles Conservatory", "Sollies Conservatory", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia France", "BAH_20-19", "BBS_6-19", "BER_11-19", "BH_44", "BH_7-19", "BHA_2-20", "BLS_53-19", "ER_13-19", "KBJ_1-19", "KBru_6-20", "KLoc_37-19", "KLSU_14-19", "MM_31-20", "MM_37-20", "MP_10-20", 
"PersoBC_2021", "PersoJLL_2021", "PersoJLL_2022", "PersoLD_2021", "PersoLD_2022", "PersoUB_2021", 
"PersoUB_2022", "S_GZ_2-19", "SBJ_3-19", "SJ_16-20", "SJ_24-20", "SJ_30-20", "TL_13-20", 
"TL_19-20"),
                     labels = c( "Iberiensis Spain", "Mellifera Savoy", "Mellifera Porquerolles", "Mellifera Sollies", 
                                "Ligustica Italy", "Carnica Slovenia", "Carnica Germany", 
                                "Carnica Switzerland", "Carnica France", "Carnica Poland", "Caucasia","BAH_20-19", "BBS_6-19", "BER_11-19", "BH_44", "BH_7-19", "BHA_2-20", "BLS_53-19", "ER_13-19", 
 "KBJ_1-19","KBru_6-20", "KLoc_37-19", "KLSU_14-19", "MM_31-20", "MM_37-20", "MP_10-20", 
"PersoBC_2021", "PersoJLL_2021", "PersoJLL_2022", "PersoLD_2021", "PersoLD_2022", "PersoUB_2021", 
"PersoUB_2022", "S_GZ_2-19", "SBJ_3-19", "SJ_16-20", "SJ_24-20", "SJ_30-20", "TL_13-20", 
"TL_19-20"))

# Ajout ACP BeeMuSe
plot1 + geom_point(data = eigenvec_merged_Corres_ID_E756_pedigree, aes(x = V5, y = V6, color = ID_2a), size = 1 ) +
  labs(title = "PCA Plot - BeeMuSe & SeqApiPop", x = "PC3", y = "PC4") +
  annotation_custom(
    ggplotGrob(
      ggplot(variance_df, aes(x = factor(PC), y = Variance)) +
        geom_bar(stat = "identity", fill = "steelblue", width = 0.8) +
        theme_minimal() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
    ), xmin = 0.097, xmax = 0.034, ymin = -0.1, ymax = -0.05)